package com.ossean.util;

import info.debatty.java.stringsimilarity.NGram;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.annotation.Resource;

import org.apache.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import com.ossean.MergeProjects;
import com.ossean.TableName;
import com.ossean.dao.DBDest;
import com.ossean.dao.DBSource;
import com.ossean.dao.GatherDao;
import com.ossean.dao2.PlatformProjectDao;
import com.ossean.model.EddRelations;
import com.ossean.model.GatherProjectsModel;
import com.ossean.model.GithubProject;
import com.ossean.model.Synonymmings;
import com.ossean.model.Synonyms;
import com.ossean.model.Taggings;
import com.ossean.model.Tags;

@Component("mergeProjectNew2")
public class MergeProjectNew2 {
	// Log4j logger named after the runtime class of this instance.
	Logger logger = Logger.getLogger(this.getClass());
	// Collaborators injected through @Resource.
	@Resource
	private DBSource dbSource;
	@Resource
	private GatherDao gatherDao;
	@Resource
	private PlatformProjectDao platformProjectDao;
	// NOTE(review): dbDest is not referenced by any method visible in this file.
	@Resource
	private DBDest dbDest;
	// Table names copied from TableName; they are passed to the DAOs on each call.
	private String synonymsTableName = TableName.synonymsTableName;
	private String gatherProjectsTableName = TableName.gatherProjectsTableName;
	private String eddRelationTableName = TableName.eddRelationTableName;
	private String sourceforgeTableName = TableName.sourceforgeTableName;
	private String openhubTableName = TableName.openhubTableName;
	private String oschinaTableName = TableName.oschinaTableName;
	// The most influential github model among the duplicates found so far;
	// it is kept out of the relation set until the search has finished.
	// NOTE(review): static and mutable, so it is shared by every instance.
	private static GatherProjectsModel gitModel = null;

	// Ids of projects already handled during the current run, both the ones
	// inserted on their own and the ones that take part in a duplicate relation.
	private static Set<Integer> removeSet;
	// Ids of the projects that have a duplicate relation with the handled project.
	private static Set<Integer> relationSet;
	// NOTE(review): only mentioned in commented-out code in the visible source.
	private static int handleCount;

	/**
	 * Fetches the synonyms stored for a project.
	 *
	 * @param model project whose id is used for the lookup
	 * @return the list produced by {@code dbSource.getSynonymByPrjId} for that id
	 */
	public List<String> getSynonyms(GatherProjectsModel model) {
		return dbSource.getSynonymByPrjId(synonymsTableName, model.getId());
	}

	/**
	 * Handles a new project: searches for its duplicates and writes the
	 * resulting relation rows.
	 *
	 * <p>The shared static state ({@code gitModel}, {@code relationSet},
	 * {@code removeSet}) is re-initialised at the start of every call, and
	 * {@code gitModel} is cleared in a {@code finally} block so that a failure
	 * in one call cannot leak a stale github candidate into the next one.
	 *
	 * @param model     project to handle
	 * @param isIncrese {@code true} when the project is incremental data
	 * @return {@code 0} when the stored row already has update_mark 2, otherwise
	 *         the number of ids collected in {@code removeSet} during this call
	 */
	public int handleNewProject(GatherProjectsModel model, boolean isIncrese) {
		// Entries of the cached list may have been handled already; skip those.
		if (gatherDao.selectGPMById(gatherProjectsTableName, model.getId())
				.getUpdate_mark() == 2)
			return 0;
		if (isIncrese) {
			logger.info("model " + model.getId() + " is increase data");
		}

		// A github project is its own first candidate; anything else starts empty.
		gitModel = "github".equals(model.getSource()) ? model : null;
		relationSet = new HashSet<Integer>();// projects with a duplicate relation
		removeSet = new HashSet<Integer>();// projects handled during this call
		try {
			relationSet = findDupProject(model, relationSet, isIncrese);
			if (gitModel != null)
				relationSet.add(gitModel.getId());
			OperateTable(relationSet, isIncrese);// table operations
			return removeSet.size();
		} finally {
			// Always drop the candidate, even when the search or the writes failed.
			gitModel = null;
		}
	}

	/**
	 * Collects into {@code relationSet} the ids of the projects regarded as
	 * duplicates of {@code model}, recursing into every newly matched project.
	 *
	 * <p>Matching runs in three stages: same homepage, same name (skipped for
	 * github projects) and shared synonyms. Whenever several candidates of one
	 * source compete, {@code getUniqueSourceModel} picks one and the others get
	 * their own relation row, are marked with update_mark 2 and are added to
	 * {@code removeSet}. Github winners are stored in the static
	 * {@code gitModel} instead of the matched list.
	 *
	 * @param model       project whose duplicates are searched
	 * @param relationSet set of ids collected so far; modified in place
	 * @param isIncrese   {@code true} for incremental data, which uses the
	 *                    "ForIncrease" DAO queries (the original note says the
	 *                    whole table should be scanned in that case)
	 * @return the same {@code relationSet} instance
	 */
	public Set<Integer> findDupProject(GatherProjectsModel model,
			Set<Integer> relationSet, boolean isIncrese) {
		// Checked again on recursive calls: a row already marked 2 is left alone.
		if (gatherDao.selectGPMById(gatherProjectsTableName, model.getId())
				.getUpdate_mark() == 2)
			return relationSet;
		String prjName = model.getName();
		int prjId = model.getId();
		String prjHomepage = model.getHomepage();
//long start = System.currentTimeMillis();
		// The apache community has no homepage; use the url instead.
		if (prjHomepage == null || prjHomepage.equals(""))
			if (model.getSource().equals("apache"))
				prjHomepage = model.getUrl();

		// Projects matched as duplicates.
		List<GatherProjectsModel> matchedList = new ArrayList<GatherProjectsModel>();
		List<GatherProjectsModel> sameHomepageList = new ArrayList<GatherProjectsModel>();
		// Projects with the same homepage are handled separately depending on
		// whether they come from the same community as the current project.
		List<GatherProjectsModel> sameHomeAndSourceList = new ArrayList<GatherProjectsModel>();
		List<Integer> matchedIdList = new ArrayList<Integer>();
		sameHomeAndSourceList.add(model);

		Map<String, List<GatherProjectsModel>> sameHomeMap = new HashMap<String, List<GatherProjectsModel>>();
		if (prjHomepage != null && !prjHomepage.equals("")) {
			// The homepage field may hold several values separated by ';'.
			String[] homepageArr = prjHomepage.split(";");
			for (String homepage : homepageArr) {
				homepage = deleteHttpPre(homepage);// strip the http/https prefix and the trailing "/"
				if (homepage.length() > 0) {
					// Build the http/https variants, with and without a trailing slash.
					String homepage1 = "http://" + homepage;
					String homepage2 = "https://" + homepage;
					String homepage3 = "http://" + homepage + "/";
					String homepage4 = "https://" + homepage + "/";
					if (isIncrese)
						sameHomepageList.addAll(gatherDao
								.selectGPMBySameHomePageForIncrease(
										gatherProjectsTableName, homepage,
										homepage1, homepage2, homepage3,
										homepage4, model.getId()));
					else
						sameHomepageList.addAll(gatherDao
								.selectGPMBySameHomePage(
										gatherProjectsTableName, homepage,
										homepage1, homepage2, homepage3,
										homepage4, model.getId(), 1));
				}
			}
			// Split the homepage matches into same-source and other-source groups.
			for (GatherProjectsModel m : sameHomepageList) {
//				logger.info("project " + model.getId() + " and " + m.getId()
//						+ " match by homepage");
				if (!m.getSource().equals(model.getSource())) {
					if (sameHomeMap.get(m.getSource()) == null)
						sameHomeMap.put(m.getSource(),
								new ArrayList<GatherProjectsModel>());
					sameHomeMap.get(m.getSource()).add(m);
				} else {
					sameHomeAndSourceList.add(m);
				}
			}
			// Same community and same homepage: pick the most influential project
			// and insert the others into the relation table.
			// NOTE(review): the list always contains model, so this test is always true.
			if (sameHomeAndSourceList.size() != 0) {
				GatherProjectsModel uniqueModel = getUniqueSourceModel(sameHomeAndSourceList);
				for (GatherProjectsModel tmp : sameHomeAndSourceList) {
					if (tmp.getId() != uniqueModel.getId()
							&& tmp.getId() != model.getId()
							&& tmp.getUpdate_mark() != 2) {
						// Insert into the relation table; the last argument is 0 for
						// github rows and 1 otherwise (meaning defined by DBSource).
						String relationStr = "," + tmp.getId() + ",";
						if(tmp.getSource().equals("github"))
							dbSource.insertEddRelations(eddRelationTableName,
								relationStr,0);
						else
							dbSource.insertEddRelations(eddRelationTableName,
									relationStr,1);
						gatherDao.updateMark(gatherProjectsTableName, 2,
								tmp.getId());
						// handleCount++;
						removeSet.add(tmp.getId());
					}
				}
				if (model.getId() != uniqueModel.getId()) {
					// The current project lost against a same-source project:
					// record its id and stop here.
					// NOTE(review): relationStr below is never used.
					String relationStr = "," + model.getId() + ",";
					relationSet.add(model.getId());
					return relationSet;
				}
				else {
				  if(uniqueModel.getSource().equals("github")){
					  // Keep the github project that compareGithubProject prefers.
					  if (gitModel == null || compareGithubProject(uniqueModel, gitModel))
							gitModel = uniqueModel;
				  }
				 }

			}
			// Several matches may share a source: make sure each source contributes
			// at most one project with the same homepage.
			for (String uniqueSource : sameHomeMap.keySet()) {
				GatherProjectsModel uniqueModel = getUniqueSourceModel(sameHomeMap
						.get(uniqueSource));
				if (uniqueModel.getSource().equals("github")) {
					if (gitModel == null
							|| compareGithubProject(uniqueModel, gitModel))
						gitModel = uniqueModel;
				}
				// Github winners live in gitModel only; the others join the matched list.
				if (!relationSet.contains(uniqueModel.getId())
						&& !uniqueModel.getSource().equals("github")) {
					matchedList.add(uniqueModel);
					matchedIdList.add(uniqueModel.getId());
				}
				for (GatherProjectsModel tmp : sameHomeMap.get(uniqueSource)) {
					if (tmp.getId() != uniqueModel.getId()
							&& tmp.getUpdate_mark() != 2) {
						// Insert into the relation table.
						String relationStr = "," + tmp.getId() + ",";
						if(tmp.getSource().equals("github"))
							dbSource.insertEddRelations(eddRelationTableName,relationStr,0);
						else
							dbSource.insertEddRelations(eddRelationTableName,relationStr,1);
						gatherDao.updateMark(gatherProjectsTableName, 2,
								tmp.getId());
						removeSet.add(tmp.getId());
					}
				}
			}

		}
//long end = System.currentTimeMillis();
//logger.warn(" homepage cost: "+(float)(end - start)/60000+" minutes");
		// Handle projects with the same name; github projects skip this stage.
		List<GatherProjectsModel> afterNamePrjList = new ArrayList<GatherProjectsModel>();
		List<Integer> afterNamePrjIdList = new ArrayList<Integer>();
		if (!model.getSource().equals("github")){
			List<GatherProjectsModel> sameNamePrjList;
			if (isIncrese) {
				sameNamePrjList = gatherDao.selectGPMBySameNameForIncrease(
						gatherProjectsTableName, prjName, prjId);
			} else {
				sameNamePrjList = gatherDao.selectGPMBySameName(
						gatherProjectsTableName, prjName, prjId, 1);
			}
			for (int i = 0; i < sameNamePrjList.size(); i++) {// avoid comparing twice
				int tmpId = sameNamePrjList.get(i).getId();
	//			logger.info("project " + model.getId() + " and "
	//					+ sameNamePrjList.get(i).getId() + " match by name");
				if (!relationSet.contains(tmpId) && !matchedIdList.contains(tmpId)) {// not handled yet
					afterNamePrjList.add(sameNamePrjList.get(i));
				}
			}
			// Keep only the candidates that isTheSame confirms (same-name thresholds).
			afterNamePrjList = returnDupModelList(model, afterNamePrjList, true);
			for (GatherProjectsModel a : afterNamePrjList) {
				afterNamePrjIdList.add(a.getId());
			}
		}
//long end1 = System.currentTimeMillis();
//logger.warn(" name cost: "+(float)(end1 - end)/60000+" minutes");
		// Handle projects that share a synonym.
		List<String> synonymList = getSynonyms(model);// synonyms of the project
		List<GatherProjectsModel> synonymsPrjList = getRelatedGatherProjects(
				model, synonymList, isIncrese);
		List<GatherProjectsModel> afterSynonymsPrjList = new ArrayList<GatherProjectsModel>();
		for (int j = 0; j < synonymsPrjList.size(); j++) {// avoid comparing twice
			int tmpId = synonymsPrjList.get(j).getId();
//			logger.info("project " + model.getId() + " and "
//					+ synonymsPrjList.get(j).getId() + " match by synonyms");
			if (!relationSet.contains(tmpId)
					&& !afterNamePrjIdList.contains(tmpId)
					&& !matchedIdList.contains(tmpId))// not handled and not a name match, otherwise it would be counted twice
				afterSynonymsPrjList.add(synonymsPrjList.get(j));
		}
		afterSynonymsPrjList = returnDupModelList(model, afterSynonymsPrjList,
				false);

		// Name matches come first: a synonym match is dropped when its source is
		// already represented by a name match.
		afterSynonymsPrjList = mergeTwoList4Source(afterNamePrjList,
				afterSynonymsPrjList);
//long end2 = System.currentTimeMillis();
//logger.warn(" synonyms cost: "+(float)(end2 - end1)/60000+" minutes");
//		for (GatherProjectsModel m : afterSynonymsPrjList) {
//			logger.info("project " + model.getId() + " and " + m.getId()
//					+ " match by synonyms/name and verify");
//		}
		matchedList = mergeTwoList4Source(matchedList, afterSynonymsPrjList);// within one community a homepage match outranks name and synonym matches
		relationSet.add(model.getId());
		if (matchedList.size() == 0)
			return relationSet;

		// Recurse into every project that has a duplicate relation.
		for (int i = 0; i < matchedList.size(); i++) {
			GatherProjectsModel dupModel = matchedList.get(i);
			if (!relationSet.contains(dupModel.getId())) {
				relationSet.add(dupModel.getId());
				if (dupModel.getUpdate_mark() != 2)
					findDupProject(dupModel, relationSet, isIncrese);
			}
		}
		return relationSet;
	}

	/**
	 * Writes the relation rows for a set of duplicate projects and marks the
	 * involved projects with update_mark 2 so they are not handled again.
	 *
	 * <p>Steps:
	 * <ol>
	 * <li>load every project of the set and group the projects by source;</li>
	 * <li>per source keep the project chosen by {@code getUniqueSourceModel};
	 * each other project that is not yet marked gets a relation row of its own,
	 * is marked and is added to {@code removeSet};</li>
	 * <li>merge the kept ids with the ids of the relation rows that already
	 * exist for them, insert the merged row and delete the old rows.</li>
	 * </ol>
	 *
	 * <p>A failure in step 3 is logged and swallowed, as before, so callers are
	 * not interrupted. NOTE(review): because the exception does not leave the
	 * method, it cannot trigger a rollback of the declared transaction; confirm
	 * that this is intended.
	 *
	 * @param theRelationSet ids of the related projects, including the handled
	 *                       project itself; {@code null} makes the call a no-op
	 * @param isIncrease     unused by the current implementation
	 */
	@Transactional(propagation = Propagation.REQUIRED)
	public void OperateTable(Set<Integer> theRelationSet, boolean isIncrease) {
		if (theRelationSet == null)
			return;

		// Because of the recursion several projects may come from one community,
		// so group the loaded projects by source first.
		Map<String, List<GatherProjectsModel>> bySource = new HashMap<String, List<GatherProjectsModel>>();
		for (int id : theRelationSet) {// the set includes the handled project itself
			GatherProjectsModel member = gatherDao.selectGPMById(
					gatherProjectsTableName, id);
			List<GatherProjectsModel> sameSource = bySource.get(member.getSource());
			if (sameSource == null) {
				sameSource = new ArrayList<GatherProjectsModel>();
				bySource.put(member.getSource(), sameSource);
			}
			sameSource.add(member);
		}

		List<Integer> finalIdList = new ArrayList<Integer>();
		StringBuilder relationStr = new StringBuilder(",");
		for (List<GatherProjectsModel> sameSource : bySource.values()) {
			GatherProjectsModel uniqueModel = getUniqueSourceModel(sameSource);
			for (GatherProjectsModel tmp : sameSource) {
				if (tmp.getId() != uniqueModel.getId()
						&& tmp.getUpdate_mark() != 2) {
					// Losers get a single-id relation row; the last argument is 0
					// for github rows and 1 otherwise.
					int flag = tmp.getSource().equals("github") ? 0 : 1;
					dbSource.insertEddRelations(eddRelationTableName,
							"," + tmp.getId() + ",", flag);
					gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId());
					removeSet.add(tmp.getId());
				}
			}
			finalIdList.add(uniqueModel.getId());
			relationStr.append(uniqueModel.getId());
			relationStr.append(",");
		}

		// Pull in the relation rows that already exist for the kept projects.
		List<EddRelations> eddRelationsList = new ArrayList<EddRelations>();
		for (int id : finalIdList) {
			EddRelations existing = dbSource.getEddRelationsByGatherProjectsId(
					eddRelationTableName, id);
			if (existing != null) {
				eddRelationsList.add(existing);
				relationStr.append(StringHandler.removeFirstComma(existing
						.getGather_projects_ids()));
			}
		}
		String finallyRelationStr = StringHandler.removeDupIdInStr(relationStr
				.toString());

		try {
			if (!finallyRelationStr.equals(",")) {
				for (int id : finalIdList) {
					GatherProjectsModel kept = gatherDao.selectGPMById(
							gatherProjectsTableName, id);
					if (kept.getUpdate_mark() != 2) {
						gatherDao.updateMark(gatherProjectsTableName, 2, id);
						removeSet.add(kept.getId());
					}
				}
				dbSource.insertEddRelations(eddRelationTableName,
						finallyRelationStr,0);
				// The merged row replaces the rows it was built from.
				for (EddRelations relation : eddRelationsList) {
					dbSource.deleteEddRelationsItem(eddRelationTableName,
							relation.getId());
				}
			}
		} catch (Exception e) {
			// Keep the best-effort behaviour, but put the failure and its stack
			// trace into the log instead of printing it to stderr.
			logger.error("failed to store relation " + finallyRelationStr, e);
		}

	}

	/**
	 * Finds the projects that share at least one synonym with {@code model}.
	 * A project is returned once even when it matches several synonyms, and the
	 * handled project itself is never returned.
	 *
	 * @param model       project being handled
	 * @param synonymList synonyms of that project
	 * @param isIncrease  when {@code false}, projects whose update_mark is 2 are
	 *                    left out
	 * @return the related projects, in the iteration order of the internal map
	 */
	public List<GatherProjectsModel> getRelatedGatherProjects(
			GatherProjectsModel model, List<String> synonymList,
			boolean isIncrease) {

		Map<Integer, GatherProjectsModel> relatedById = new HashMap<Integer, GatherProjectsModel>();
		for (String synonym : synonymList) {
			for (int candidateId : dbSource.getSynonymBySynonyms(
					synonymsTableName, synonym)) {
				// Skip the project itself and ids collected for an earlier synonym.
				if (candidateId == model.getId()
						|| relatedById.containsKey(candidateId))
					continue;
				GatherProjectsModel candidate = gatherDao.selectGPMById(
						gatherProjectsTableName, candidateId);
				if (isIncrease || candidate.getUpdate_mark() != 2)
					relatedById.put(candidateId, candidate);
			}
		}
		return new ArrayList<GatherProjectsModel>(relatedById.values());
	}

	/**
	 * Filters {@code list} down to the candidates that {@code isTheSame}
	 * accepts as duplicates of {@code model}, keeping one project per source.
	 *
	 * <p>Side effects: for every source with several accepted candidates the
	 * projects not chosen by {@code getUniqueSourceModel} get a relation row,
	 * are marked with update_mark 2 and are added to {@code removeSet}. A chosen
	 * github project is not returned; it may replace the static
	 * {@code gitModel} instead.
	 *
	 * @param model    project being handled
	 * @param list     candidate duplicates
	 * @param samename {@code true} when the candidates matched by name,
	 *                 {@code false} when they matched by synonym
	 * @return the chosen non-github project of every source
	 */
	public List<GatherProjectsModel> returnDupModelList(
			GatherProjectsModel model, List<GatherProjectsModel> list,
			boolean samename) {
		// Accepted candidates from other sources, bucketed by their source.
		Map<String, List<GatherProjectsModel>> acceptedBySource = new HashMap<String, List<GatherProjectsModel>>();
		for (GatherProjectsModel candidate : list) {
			if (candidate.getSource().equals(model.getSource()))
				continue;
			if (!isTheSame(model, candidate, samename))
				continue;
			List<GatherProjectsModel> bucket = acceptedBySource.get(candidate
					.getSource());
			if (bucket == null) {
				bucket = new ArrayList<GatherProjectsModel>();
				acceptedBySource.put(candidate.getSource(), bucket);
			}
			bucket.add(candidate);
		}

		// When a source contributes several candidates, keep only one of them.
		List<GatherProjectsModel> survivors = new ArrayList<GatherProjectsModel>();
		for (Map.Entry<String, List<GatherProjectsModel>> group : acceptedBySource
				.entrySet()) {
			List<GatherProjectsModel> sameSource = group.getValue();
			GatherProjectsModel keeper = getUniqueSourceModel(sameSource);

			for (GatherProjectsModel other : sameSource) {
				if (other.getId() == keeper.getId()
						|| other.getUpdate_mark() == 2)
					continue;
				// Insert into the relation table.
				String relationStr = "," + other.getId() + ",";
				int flag = other.getSource().equals("github") ? 0 : 1;
				dbSource.insertEddRelations(eddRelationTableName, relationStr,
						flag);
				gatherDao.updateMark(gatherProjectsTableName, 2, other.getId());
				removeSet.add(other.getId());
			}

			if (keeper.getSource().equals("github")) {
				if (gitModel == null || compareGithubProject(keeper, gitModel))
					gitModel = keeper;
			} else {
				survivors.add(keeper);
			}
		}
		return survivors;
	}

	/**
	 * Tells whether the github project behind {@code model1} has strictly more
	 * stargazers than the one behind {@code model2}. Both projects are looked
	 * up by url.
	 *
	 * @return {@code true} when model1 has more stargazers; {@code false}
	 *         otherwise, including when a lookup fails (both urls are then
	 *         printed to standard output)
	 */
	public boolean compareGithubProject(GatherProjectsModel model1,
			GatherProjectsModel model2) {
		boolean firstHasMoreStars = false;
		try {
			firstHasMoreStars = platformProjectDao.getGithubPrjByUrl(
					TableName.githubTableName, model1.getUrl())
					.getStargazers_count() > platformProjectDao
					.getGithubPrjByUrl(TableName.githubTableName,
							model2.getUrl()).getStargazers_count();
		} catch (Exception e) {
			System.out.println(model1.getUrl());
			System.out.println(model2.getUrl());
		}
		return firstHasMoreStars;
	}

	/**
	 * Picks the representative project of a list whose members all come from
	 * the same source.
	 *
	 * <p>For oschina, openhub and sourceforge the project with the highest
	 * metric wins (star count, user count and download count respectively);
	 * ties keep the earlier element. Every other source, and a single-element
	 * list, yields the first element. Candidates without a url are skipped, and
	 * a candidate whose metric cannot be read is reported on standard output
	 * and skipped. When the metric of the first element cannot be read, the
	 * first element is returned.
	 *
	 * <p>The metric of the current best project is cached, so each project is
	 * looked up at most once.
	 *
	 * @param list non-empty list of projects; the source of the first element
	 *             decides which metric is used
	 * @return the chosen project, always an element of {@code list}
	 */
	public GatherProjectsModel getUniqueSourceModel(
			List<GatherProjectsModel> list) {
		GatherProjectsModel best = list.get(0);
		String source = best.getSource();
		boolean ranked = "oschina".equals(source) || "openhub".equals(source)
				|| "sourceforge".equals(source);
		if (list.size() == 1 || !ranked)
			return best;

		int bestScore;
		try {
			bestScore = influenceOf(source, best);
		} catch (Exception e) {
			// Without a baseline no candidate can be compared; keep the first one.
			System.out.println(best.getSource() + " ==== " + best.getUrl());
			return best;
		}

		for (int i = 1; i < list.size(); i++) {
			GatherProjectsModel candidate = list.get(i);
			if (candidate.getUrl() == null || candidate.getUrl().equals(""))
				continue;
			try {
				int score = influenceOf(source, candidate);
				if (score > bestScore) {
					best = candidate;
					bestScore = score;
				}
			} catch (Exception e) {
				System.out.println(candidate.getSource() + " ==== "
						+ candidate.getUrl());
				System.out.println(best.getSource() + " ==== "
						+ best.getUrl());
			}
		}
		return best;
	}

	// Reads the source-specific popularity metric of a project by its url.
	private int influenceOf(String source, GatherProjectsModel project) {
		String url = project.getUrl();
		if ("oschina".equals(source))
			return platformProjectDao.getOschinaPrjByUrl(oschinaTableName, url)
					.getStar_num();
		if ("openhub".equals(source))
			return platformProjectDao.getOpenHubPrjByUrl(openhubTableName, url)
					.getUser_num();
		return platformProjectDao.getSourceForgePrjByUrl(sourceforgeTableName,
				url).getDownload_num();
	}

	/**
	 * Decides whether two projects from different sources describe the same
	 * software.
	 *
	 * <p>Rules, in order:
	 * <ol>
	 * <li>projects of the same source are never the same;</li>
	 * <li>a sourceforge description equal to the generated placeholder
	 * {@code "No description <name> Web Site"} is cleared on the model (this
	 * mutates the argument);</li>
	 * <li>for a name match involving an oschina project, an equal language as
	 * judged by {@code StringHandler.compareLanguage} is sufficient (the
	 * original note says oschina tags and descriptions are in Chinese);</li>
	 * <li>a name match needs at least one common tag or a description
	 * similarity above 0.1; a synonym match needs more than one common tag or
	 * a description similarity of at least 0.3.</li>
	 * </ol>
	 *
	 * @param model1   first project
	 * @param model2   second project
	 * @param sameName {@code true} for a name match, {@code false} for a
	 *                 synonym match
	 * @return {@code true} when the projects are regarded as the same
	 */
	public boolean isTheSame(GatherProjectsModel model1,
			GatherProjectsModel model2, boolean sameName) {
		if (model1.getSource().equals(model2.getSource()))
			return false;

		clearSourceforgePlaceholder(model1);
		clearSourceforgePlaceholder(model2);

		if (sameName) {
			// The sources differ, so at most one of the two is an oschina project.
			boolean sameLanguage = false;
			if (model1.getSource().equals("oschina"))
				sameLanguage = StringHandler.compareLanguage(
						model1.getLanguage(), model2.getLanguage());
			else if (model2.getSource().equals("oschina"))
				sameLanguage = StringHandler.compareLanguage(
						model2.getLanguage(), model1.getLanguage());
			if (sameLanguage)
				return true;
		}

		int similarTagNum = calSimilarityByTag(model1, model2);
		if (sameName)
			return similarTagNum >= 1 || calSimilarity(model1, model2) > 0.1;
		return similarTagNum > 1 || calSimilarity(model1, model2) >= 0.3;
	}

	// Treats the placeholder description generated by sourceforge as "no description".
	private void clearSourceforgePlaceholder(GatherProjectsModel project) {
		if (!project.getSource().equals("sourceforge"))
			return;
		String description = project.getDescription();
		if (description != null
				&& description.equals("No description " + project.getName()
						+ " Web Site"))
			project.setDescription(null);
	}

	/**
	 * Computes the similarity of two project descriptions as one minus their
	 * 4-gram distance.
	 *
	 * @return {@code 0} when either description is {@code null} or empty,
	 *         otherwise {@code 1 - distance}
	 */
	public double calSimilarity(GatherProjectsModel model1,
			GatherProjectsModel model2) {
		String firstText = model1.getDescription();
		String secondText = model2.getDescription();
		boolean firstMissing = firstText == null || firstText.equals("");
		boolean secondMissing = secondText == null || secondText.equals("");
		if (firstMissing || secondMissing)
			return 0;
		double distance = new NGram(4).distance(firstText, secondText);
		return 1 - distance;
	}

	/**
	 * Counts the tags of {@code model1} that also occur in {@code model2}.
	 * Only meaningful when both projects have tags: a {@code null} or empty tag
	 * string on either side yields {@code 0}.
	 *
	 * <p>Sourceforge tags are split with
	 * {@code StringHandler.splitTagsByBracket}; all other tags are split on
	 * commas. Every tag is normalised by {@code changeArrayToList} before the
	 * comparison.
	 *
	 * @return the count computed by {@code getSameTagsNum}
	 */
	public int calSimilarityByTag(GatherProjectsModel model1,
			GatherProjectsModel model2) {
		String firstTags = model1.getTags();
		String secondTags = model2.getTags();
		if (firstTags == null || firstTags.equals("") || secondTags == null
				|| secondTags.equals(""))
			return 0;

		String[] firstParts = model1.getSource().equals("sourceforge")
				? StringHandler.splitTagsByBracket(firstTags)
				: firstTags.split(",");
		String[] secondParts = model2.getSource().equals("sourceforge")
				? StringHandler.splitTagsByBracket(secondTags)
				: secondTags.split(",");

		return getSameTagsNum(changeArrayToList(firstParts),
				changeArrayToList(secondParts));
	}

	/**
	 * Converts an array of raw tags into a list, passing every tag through
	 * {@code RegexHandler.extractEngDecimalAndChinese}.
	 *
	 * @param strs raw tags
	 * @return the processed tags, in the original order
	 */
	public List<String> changeArrayToList(String[] strs) {
		List<String> processed = new ArrayList<String>(strs.length);
		for (int i = 0; i < strs.length; i++) {
			processed.add(RegexHandler.extractEngDecimalAndChinese(strs[i]));
		}
		return processed;
	}

	/**
	 * Normalises a homepage for matching: trims surrounding whitespace, removes
	 * a leading {@code http://} or {@code https://} and removes one trailing
	 * {@code /}.
	 *
	 * <p>The scheme is matched case-insensitively, because URL schemes are
	 * case-insensitive (RFC 3986, section 3.1); {@code "HTTP://Example.com"}
	 * therefore yields {@code "Example.com"}. The rest of the string keeps its
	 * case.
	 *
	 * @param homepage homepage as stored; may be {@code null} or empty
	 * @return the normalised homepage; {@code null} and {@code ""} are returned
	 *         unchanged
	 */
	public String deleteHttpPre(String homepage) {
		if (homepage == null || homepage.isEmpty())
			return homepage;

		String stripped = homepage.trim();
		if (stripped.regionMatches(true, 0, "http://", 0, 7)) {
			stripped = stripped.substring(7);
		} else if (stripped.regionMatches(true, 0, "https://", 0, 8)) {
			stripped = stripped.substring(8);
		}
		// Anything else does not start with http or https and is kept as it is.

		if (stripped.endsWith("/"))
			stripped = stripped.substring(0, stripped.length() - 1);
		return stripped;
	}

	// Compiled once; the expression is constant and Pattern instances are immutable.
	private static final Pattern HTTP_URL_PATTERN = Pattern.compile(
			"http(s)?://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Tells whether the text contains something that looks like an http or
	 * https url. The pattern is searched for anywhere in the text, so
	 * surrounding characters do not prevent a match.
	 *
	 * @param urlStr text to inspect; {@code null} is treated as "no url"
	 * @return {@code true} when an http(s) url is found
	 */
	public static boolean isRightUrlByRegex(String urlStr) {
		if (urlStr == null)
			return false;
		Matcher matcher = HTTP_URL_PATTERN.matcher(urlStr);
		return matcher.find();
	}

	/**
	 * Removes case-insensitive duplicates while keeping the first spelling of
	 * every string and the original order.
	 *
	 * <p>The comparison key is built with {@code Locale.ROOT}, so the result
	 * does not depend on the default locale of the JVM (with a Turkish default
	 * locale {@code "TITLE"} and {@code "title"} would otherwise get different
	 * keys).
	 *
	 * @param array strings to filter; elements must not be {@code null}
	 * @return a new list without case-insensitive duplicates
	 */
	public List<String> removeDupStr(List<String> array) {
		List<String> unique = new ArrayList<String>();
		Set<String> seenKeys = new HashSet<String>();
		for (String candidate : array) {
			String key = candidate.toLowerCase(java.util.Locale.ROOT);
			if (seenKeys.add(key)) {
				unique.add(candidate);
			}
		}
		return unique;
	}

	/**
	 * Merges two project lists into a new list without duplicate ids. The
	 * elements of {@code list1} come first; for every id the first occurrence
	 * is kept.
	 *
	 * @param list1 first list
	 * @param list2 second list
	 * @return a new list holding each project id at most once
	 */
	public List<GatherProjectsModel> mergeTwoList(
			List<GatherProjectsModel> list1, List<GatherProjectsModel> list2) {
		List<GatherProjectsModel> result = new ArrayList<GatherProjectsModel>();
		// Typed set instead of a raw one, so the id type is checked by the compiler.
		Set<Integer> seenIds = new HashSet<Integer>();
		for (GatherProjectsModel model : list1) {
			if (seenIds.add(model.getId()))
				result.add(model);
		}
		for (GatherProjectsModel model : list2) {
			if (seenIds.add(model.getId()))
				result.add(model);
		}
		return result;
	}

	/**
	 * Merges two project lists, giving {@code list1} priority per source.
	 *
	 * <p>Every project of {@code list1} is kept once per id. A project of
	 * {@code list2} is kept only when its id is new and its source has not been
	 * seen yet, neither in {@code list1} nor in an earlier kept element of
	 * {@code list2}.
	 *
	 * @param list1 list with priority
	 * @param list2 list that may only contribute sources not seen before
	 * @return a new merged list
	 */
	public List<GatherProjectsModel> mergeTwoList4Source(
			List<GatherProjectsModel> list1, List<GatherProjectsModel> list2) {
		List<GatherProjectsModel> result = new ArrayList<GatherProjectsModel>();
		// Typed sets instead of raw ones, so element types are checked by the compiler.
		Set<Integer> seenIds = new HashSet<Integer>();
		Set<String> seenSources = new HashSet<String>();
		for (GatherProjectsModel model : list1) {
			seenSources.add(model.getSource());
			if (seenIds.add(model.getId()))
				result.add(model);
		}
		for (GatherProjectsModel model : list2) {
			// The id is recorded even when the source turns out to be taken; the
			// source is only recorded for ids that are new.
			if (seenIds.add(model.getId())
					&& seenSources.add(model.getSource()))
				result.add(model);
		}
		return result;
	}

	/**
	 * Counts the entries of {@code list1} that {@code isExist} finds in
	 * {@code list2}. An entry that occurs several times in {@code list1} is
	 * counted every time.
	 *
	 * @param list1 tags to look for
	 * @param list2 tags to search in
	 * @return number of entries of {@code list1} present in {@code list2}
	 */
	public int getSameTagsNum(List<String> list1, List<String> list2) {
		int matches = 0;
		for (String tag : list1) {
			matches += isExist(tag, list2) ? 1 : 0;
		}
		return matches;
	}

	/**
	 * Tells whether {@code list} contains {@code input}, ignoring case.
	 *
	 * <p>Both sides are lower-cased with {@code Locale.ROOT}, so the result
	 * does not depend on the default locale of the JVM.
	 *
	 * @param input string to look for
	 * @param list  strings to search in; elements must not be {@code null}
	 * @return {@code true} when a case-insensitive match exists; always
	 *         {@code false} for an empty list
	 */
	public boolean isExist(String input, List<String> list) {
		if (list.isEmpty())
			return false;
		// Lower-case the needle once instead of once per element.
		String wanted = input.toLowerCase(java.util.Locale.ROOT);
		for (String candidate : list) {
			if (candidate.toLowerCase(java.util.Locale.ROOT).equals(wanted))
				return true;
		}
		return false;
	}

	/**
	 * Counts the non-overlapping occurrences of {@code subStr} in {@code str},
	 * scanning from left to right.
	 *
	 * <p>Implemented iteratively: an empty {@code subStr} matches at every
	 * position without consuming input, so a recursive search over the
	 * remainder would never terminate, and long inputs would need one stack
	 * frame per match.
	 *
	 * @param str    text to search in
	 * @param subStr text to search for
	 * @return number of occurrences; {@code 0} when either argument is
	 *         {@code null} or {@code subStr} is empty
	 */
	public int includeSubStringNum(String str, String subStr) {
		if (str == null || subStr == null || subStr.isEmpty())
			return 0;
		int count = 0;
		int from = str.indexOf(subStr);
		while (from != -1) {
			count++;
			// Continue right after the match so that matches never overlap.
			from = str.indexOf(subStr, from + subStr.length());
		}
		return count;
	}

	/**
	 * Manual entry point: starts the Spring context from the
	 * {@code applicationContext*.xml} files on the classpath, runs
	 * {@link #test()} on the {@code MergeProjectNew2} bean and closes the
	 * context afterwards so that the resources held by its beans are released.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		ClassPathXmlApplicationContext applicationContext = new ClassPathXmlApplicationContext(
				"classpath:/applicationContext*.xml");
		try {
			MergeProjectNew2 merger = applicationContext
					.getBean(MergeProjectNew2.class);
			merger.test();
		} finally {
			applicationContext.close();
		}
	}

	/**
	 * Smoke run used by {@code main}: loads the project with the hard-coded id
	 * 543 and handles it as non-incremental data.
	 */
	public void test() {
		handleNewProject(
				gatherDao.selectGPMById(gatherProjectsTableName, 543), false);
	}
}
